package us.codecraft.demo.pipeline;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import us.codecraft.demo.model.HotelJiangSu;
import us.codecraft.demo.service.HotelService;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class XiechengProcessor implements PageProcessor {
	private ApplicationContext context;
	 private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(3000)
	            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36");
	
	private static final String prefix = "http://hotels.ctrip.com/hotel/";
	 private static final String suffix="#ctm_ref=hod_hp_sb_lst";
	 public static final  Map<String,String> cache=new  HashMap<String,String>();//存放酒店最低价
	 public XiechengProcessor() {
	        context = new ClassPathXmlApplicationContext("classpath:applicationContext.xml");
	    }
	 @Autowired
	 private HotelService hotelService;
	 public Site getSite() {
		 return site;
	}

	public void process(Page page) {
		List<String> citys=page.getHtml().regex("City\\|[\u4e00-\u9fa5]+\\|\\d+").all();//正则获取省直辖市
		if(!citys.isEmpty())
		{
			 List<String> cityList=new ArrayList<String>();
			 List<String> urlList=new ArrayList<String>();
			for (String string : citys) {
				System.out.println(string);
				HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
				 defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);// 输出拼音全部小写
				 defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);// 不带声调
				defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V) ;
				String pinyinAll="";
				 String [] arry=string.split("\\|");
				 char[] cl_chars = arry[1].trim().toCharArray();
				 for (char c : cl_chars) {
					 try {
						 pinyinAll+=PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat)[0];
						
					} catch (BadHanyuPinyinOutputFormatCombination e) {
						e.printStackTrace();
					}
				}
				 cityList.add(pinyinAll+arry[2]);
			}
			for (String cityPinYin : cityList) {
				String url=prefix+cityPinYin+suffix;
				urlList.add(url);
			}
			page.addTargetRequests(urlList);//将直辖市url放入队列中
		}else if(("1").equals(page.getHtml().xpath("//div[@class='page_box']//div[@class='c_page_list layoutfix']//a[@class='current']/text()").toString()))
		{
			//String hostDiv=page.getHtml().xpath("//div[@id='hotel_list']").toString();
			
			String pageSize=page.getHtml().xpath("//div[@class='page_box']//div[@class='c_page_list layoutfix']//a[8]/text()").toString();
			String pageUrl=page.getHtml().xpath("//div[@class='page_box']//div[@class='c_page_list layoutfix']//a[2]/@href").toString();
			//String pageUrl="http://hotels.ctrip.com/hotel/suzhou14/p2";
			pageUrl=pageUrl.substring(0, pageUrl.length()-1);
			if(StringUtils.isNotBlank(pageSize))
			{
				int size=Integer.parseInt(pageSize);
				List<String> pageUrlList=new ArrayList<String>();
				for (int i = 2; i <=size; i++) 
				{
					String url="";
					url=pageUrl+i;
					pageUrlList.add(url);
				}
				page.addTargetRequests(pageUrlList);//将分页的所有链接放入队列
			}
			List<String> priceList=page.getHtml().xpath("//div[@id='hotel_list']//div[@class='hotel_new_list']//ul//div[@class='hotel_price']//a").all();
			for (String string : priceList) {
				Document document= Jsoup.parse(string);
				Elements element=document.getElementsByTag("a");
				String href=element.get(0).attr("href");
				String price=element.get(0).child(0).text();
				System.out.println(href+"   "+price);
				cache.put(href, price);
			}
			page.addTargetRequests(page.getHtml().xpath("//div[@id='hotel_list']//h2[@class='hotel_name']//a").links().all());
			System.out.println(pageUrl);
		}else if(page.getHtml().xpath("//div[@id='hotel_list']").match())
		{
			page.addTargetRequests(page.getHtml().xpath("//div[@id='hotel_list']//h2[@class='hotel_name']//a").links().all());
			List<String> priceList=page.getHtml().xpath("//div[@id='hotel_list']//div[@class='hotel_new_list']//ul//div[@class='hotel_price']//a").all();
			for (String string : priceList) {
				Document document= Jsoup.parse(string);
				Elements element=document.getElementsByTag("a");
				String href=element.get(0).attr("href");
				String price=element.get(0).child(0).text();
				System.out.println(href+"   "+price);
				cache.put(href, price);
			}
		}
		if(page.getHtml().xpath("//div[@class='htl_info']").match())
		{
			HotelJiangSu hotelJiangSu=new HotelJiangSu();
			System.out.println("酒店名称："+page.getHtml().xpath("//div[@class='htl_info']//div[@class='name']//h2[1]/text()").toString());
			hotelJiangSu.setName(page.getHtml().xpath("//div[@class='htl_info']//div[@class='name']//h2[1]/text()").toString());
			
			System.out.println("链接地址："+page.getUrl());
			hotelJiangSu.setUrl(page.getUrl().toString());
			System.out.println("酒店最低价："+cache.get(page.getUrl().toString()));
			hotelJiangSu.setPrice(cache.get(page.getUrl().toString()));
			System.out.println("城市："+page.getHtml().xpath("//div[@class='adress']//span[1]/text()").toString());
			hotelJiangSu.setCity(page.getHtml().xpath("//div[@class='adress']//span[1]/text()").toString());
			
			System.out.println("区/县："+page.getHtml().xpath("//div[@class='adress']//span[2]/text()").toString());
			hotelJiangSu.setCityArea(page.getHtml().xpath("//div[@class='adress']//span[2]/text()").toString());
			
			System.out.println("地址："+page.getHtml().xpath("//div[@class='adress']//span[3]/text()").toString());
			hotelJiangSu.setAddress(page.getHtml().xpath("//div[@class='adress']//span[3]/text()").toString());
			
			System.out.println("附近坐标："+page.getHtml().xpath("//div[@class='adress']//span[4]/text()").toString());
			hotelJiangSu.setCoordinate(page.getHtml().xpath("//div[@class='adress']//span[4]/text()").toString());
			
			//System.out.println("最低价格："+page.getHtml().xpath("//div[@class='htl_info']//div[@id='div_minprice']").toString());
			String hotelSuggest=page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p/text()").toString();
			System.out.println("酒店开业时间："+page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p/text()").regex("\\d+年开业"));
			System.out.println("酒店装修时间："+page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p/text()").regex("\\d+年装修"));
			System.out.println("酒店房间数："+page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p/text()").regex("\\d+间房"));
			
			hotelJiangSu.setContact(hotelSuggest);
			
			System.out.println("酒店评分："+page.getHtml().xpath("//div[@class='htl_com_box basefix']//a//p//span/text()").toString());
			hotelJiangSu.setGrade(page.getHtml().xpath("//div[@class='htl_com_box basefix']//a//p//span/text()").toString());
			
			System.out.println("联系电话："+page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p//span/@data-real").regex("电话\\d+\\-\\d+"));
			System.out.println("传真："+page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p//span/@data-real").regex("传真\\d+\\-\\d+"));
			
			hotelJiangSu.setIntroduce(page.getHtml().xpath("//div[@class='hotel_info_comment detail_content']//div[@id='htlDes']//p//span/@data-real").toString());
			
			hotelService=(HotelService) context.getBean("hotelService");
		
			hotelService.insert(hotelJiangSu);
		}
		
	}

	public static void main(String[] args) {
		Spider.create(new XiechengProcessor())  
        // 从"http://baozoumanhua.com/text"开始抓  
        .addUrl("http://hotels.ctrip.com/Domestic/Tool/AjaxDestination.aspx?keyword=%25u6C5F%25u82CF&from=domestic")  
        // 开启5个线程抓取  
        .thread(5)  
        // 启动爬虫  
        .run();  
	}

}
